# Read the Inside Airbnb listings file for Istanbul (2020-06-28 snapshot).
listings <- vroom(
  file = "http://data.insideairbnb.com/turkey/marmara/istanbul/2020-06-28/data/listings.csv.gz"
)

1 Exploratory Data Analysis

1.1 What does the data set contain?

# Print one line per column showing its type and first few values.
listings %>%
  glimpse()
## Rows: 23,728
## Columns: 106
## $ id                                           <dbl> 4826, 20815, 27271, 2827…
## $ listing_url                                  <chr> "https://www.airbnb.com/…
## $ scrape_id                                    <dbl> 2.02e+13, 2.02e+13, 2.02…
## $ last_scraped                                 <date> 2020-06-28, 2020-06-29,…
## $ name                                         <chr> "The Place", "The Bospho…
## $ summary                                      <chr> "My place is close to gr…
## $ space                                        <chr> "A double bed apartment …
## $ description                                  <chr> "My place is close to gr…
## $ experiences_offered                          <chr> "none", "none", "none", …
## $ neighborhood_overview                        <chr> NA, "The lovely neighbor…
## $ notes                                        <chr> NA, "The house may be su…
## $ transit                                      <chr> NA, "The city center, Ta…
## $ access                                       <chr> NA, "Our dear guests may…
## $ interaction                                  <chr> NA, "Depending on our ti…
## $ house_rules                                  <chr> NA, "- Windows facing th…
## $ thumbnail_url                                <lgl> NA, NA, NA, NA, NA, NA, …
## $ medium_url                                   <lgl> NA, NA, NA, NA, NA, NA, …
## $ picture_url                                  <chr> "https://a0.muscache.com…
## $ xl_picture_url                               <lgl> NA, NA, NA, NA, NA, NA, …
## $ host_id                                      <dbl> 6603, 78838, 117026, 121…
## $ host_url                                     <chr> "https://www.airbnb.com/…
## $ host_name                                    <chr> "Kaan", "Gülder", "Mutlu…
## $ host_since                                   <date> 2009-01-14, 2010-02-08,…
## $ host_location                                <chr> "Istanbul, Istanbul, Tur…
## $ host_about                                   <chr> "Hello...\r\nI am Kaan a…
## $ host_response_time                           <chr> "N/A", "N/A", "N/A", "N/…
## $ host_response_rate                           <chr> "N/A", "N/A", "N/A", "N/…
## $ host_acceptance_rate                         <chr> "N/A", "N/A", "50%", "10…
## $ host_is_superhost                            <lgl> FALSE, FALSE, FALSE, FAL…
## $ host_thumbnail_url                           <chr> "https://a0.muscache.com…
## $ host_picture_url                             <chr> "https://a0.muscache.com…
## $ host_neighbourhood                           <chr> "Üsküdar", "Beşiktaş", "…
## $ host_listings_count                          <dbl> 1, 2, 1, 20, 1, 1, 1, 2,…
## $ host_total_listings_count                    <dbl> 1, 2, 1, 20, 1, 1, 1, 2,…
## $ host_verifications                           <chr> "['email', 'phone', 'fac…
## $ host_has_profile_pic                         <lgl> TRUE, TRUE, TRUE, TRUE, …
## $ host_identity_verified                       <lgl> FALSE, FALSE, TRUE, FALS…
## $ street                                       <chr> "Istanbul Province, Ista…
## $ neighbourhood                                <chr> "Üsküdar", "Beşiktaş", "…
## $ neighbourhood_cleansed                       <chr> "Uskudar", "Besiktas", "…
## $ neighbourhood_group_cleansed                 <lgl> NA, NA, NA, NA, NA, NA, …
## $ city                                         <chr> "Istanbul Province", "Is…
## $ state                                        <chr> "Istanbul", NA, NA, NA, …
## $ zipcode                                      <chr> "34684", "34345", "34433…
## $ market                                       <chr> "Istanbul", "Istanbul", …
## $ smart_location                               <chr> "Istanbul Province, Turk…
## $ country_code                                 <chr> "TR", "TR", "TR", "TR", …
## $ country                                      <chr> "Turkey", "Turkey", "Tur…
## $ latitude                                     <dbl> 41.1, 41.1, 41.0, 41.0, …
## $ longitude                                    <dbl> 29.1, 29.0, 29.0, 29.0, …
## $ is_location_exact                            <lgl> FALSE, TRUE, FALSE, TRUE…
## $ property_type                                <chr> "Apartment", "Apartment"…
## $ room_type                                    <chr> "Entire home/apt", "Enti…
## $ accommodates                                 <dbl> 2, 3, 2, 5, 2, 3, 2, 2, …
## $ bathrooms                                    <dbl> 1.0, 1.0, 1.0, 1.0, 1.0,…
## $ bedrooms                                     <dbl> 0, 2, 1, 1, 2, 1, 1, 1, …
## $ beds                                         <dbl> 1, 2, 1, 3, 2, 1, 1, 1, …
## $ bed_type                                     <chr> "Real Bed", "Real Bed", …
## $ amenities                                    <chr> "{TV,\"Cable TV\",Intern…
## $ square_feet                                  <dbl> 700, NA, NA, 753, 700, 0…
## $ price                                        <chr> "$720.00", "$816.00", "$…
## $ weekly_price                                 <chr> NA, "$1,556.00", "$1,769…
## $ monthly_price                                <chr> NA, "$5,327.00", "$6,307…
## $ security_deposit                             <chr> NA, "$679.00", "$769.00"…
## $ cleaning_fee                                 <chr> NA, NA, "$308.00", "$77.…
## $ guests_included                              <dbl> 2, 4, 2, 2, 6, 1, 1, 2, …
## $ extra_people                                 <chr> "$178.00", "$240.00", "$…
## $ minimum_nights                               <dbl> 1, 365, 30, 3, 3, 3, 1, …
## $ maximum_nights                               <dbl> 730, 900, 90, 360, 60, 1…
## $ minimum_minimum_nights                       <dbl> 1, 365, 30, 3, 3, 3, 1, …
## $ maximum_minimum_nights                       <dbl> 1, 365, 30, 3, 3, 3, 1, …
## $ minimum_maximum_nights                       <dbl> 730, 900, 90, 360, 60, 1…
## $ maximum_maximum_nights                       <dbl> 730, 900, 90, 360, 60, 1…
## $ minimum_nights_avg_ntm                       <dbl> 1, 365, 30, 3, 3, 3, 1, …
## $ maximum_nights_avg_ntm                       <dbl> 730, 900, 90, 360, 60, 1…
## $ calendar_updated                             <chr> "38 months ago", "7 mont…
## $ has_availability                             <lgl> TRUE, TRUE, TRUE, TRUE, …
## $ availability_30                              <dbl> 30, 13, 28, 30, 28, 29, …
## $ availability_60                              <dbl> 60, 26, 58, 60, 58, 59, …
## $ availability_90                              <dbl> 90, 36, 80, 90, 88, 89, …
## $ availability_365                             <dbl> 365, 279, 289, 365, 88, …
## $ calendar_last_scraped                        <date> 2020-06-28, 2020-06-29,…
## $ number_of_reviews                            <dbl> 1, 41, 13, 0, 0, 0, 1, 1…
## $ number_of_reviews_ltm                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ first_review                                 <date> 2009-06-01, 2010-03-24,…
## $ last_review                                  <date> 2009-06-01, 2018-11-07,…
## $ review_scores_rating                         <dbl> 100, 90, 98, NA, NA, NA,…
## $ review_scores_accuracy                       <dbl> NA, 9, 10, NA, NA, NA, N…
## $ review_scores_cleanliness                    <dbl> NA, 9, 9, NA, NA, NA, NA…
## $ review_scores_checkin                        <dbl> NA, 10, 10, NA, NA, NA, …
## $ review_scores_communication                  <dbl> NA, 10, 10, NA, NA, NA, …
## $ review_scores_location                       <dbl> NA, 10, 10, NA, NA, NA, …
## $ review_scores_value                          <dbl> NA, 9, 10, NA, NA, NA, N…
## $ requires_license                             <lgl> FALSE, FALSE, FALSE, FAL…
## $ license                                      <lgl> NA, NA, NA, NA, NA, NA, …
## $ jurisdiction_names                           <lgl> NA, NA, NA, NA, NA, NA, …
## $ instant_bookable                             <lgl> FALSE, FALSE, FALSE, TRU…
## $ is_business_travel_ready                     <lgl> FALSE, FALSE, FALSE, FAL…
## $ cancellation_policy                          <chr> "flexible", "moderate", …
## $ require_guest_profile_picture                <lgl> FALSE, TRUE, FALSE, FALS…
## $ require_guest_phone_verification             <lgl> FALSE, FALSE, FALSE, FAL…
## $ calculated_host_listings_count               <dbl> 1, 2, 1, 19, 1, 1, 1, 2,…
## $ calculated_host_listings_count_entire_homes  <dbl> 1, 1, 1, 6, 1, 0, 0, 1, …
## $ calculated_host_listings_count_private_rooms <dbl> 0, 1, 0, 0, 0, 1, 1, 1, …
## $ calculated_host_listings_count_shared_rooms  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ reviews_per_month                            <dbl> 0.01, 0.33, 0.19, NA, NA…
# Summarise every column, grouped by type (missing counts, distinct values, quantiles).
skim(listings) 
Data summary
Name listings
Number of rows 23728
Number of columns 106
_______________________
Column type frequency:
character 46
Date 5
logical 16
numeric 39
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
listing_url 0 1.00 33 37 0 23728 0
name 54 1.00 1 108 0 22685 0
summary 3779 0.84 1 1000 0 17202 1
space 11361 0.52 1 1000 0 10575 0
description 2876 0.88 1 1000 0 18952 0
experiences_offered 0 1.00 4 4 0 1 0
neighborhood_overview 12658 0.47 1 1000 0 8867 0
notes 18474 0.22 1 1000 0 4221 0
transit 13660 0.42 1 1000 0 8147 0
access 16337 0.31 1 1000 0 5943 0
interaction 15019 0.37 1 1000 0 6607 0
house_rules 16407 0.31 1 1000 0 6306 0
picture_url 0 1.00 80 146 0 22996 0
host_url 0 1.00 38 43 0 14450 0
host_name 1 1.00 1 35 0 4907 0
host_location 83 1.00 2 105 0 775 0
host_about 11902 0.50 1 5717 0 6144 10
host_response_time 1 1.00 3 18 0 5 0
host_response_rate 1 1.00 2 4 0 60 0
host_acceptance_rate 1 1.00 2 4 0 85 0
host_thumbnail_url 1 1.00 55 106 0 14358 0
host_picture_url 1 1.00 57 109 0 14358 0
host_neighbourhood 15027 0.37 4 33 0 59 0
host_verifications 0 1.00 2 158 0 277 0
street 0 1.00 10 116 0 1180 0
neighbourhood 5377 0.77 4 15 0 15 0
neighbourhood_cleansed 0 1.00 4 13 0 39 0
city 773 0.97 2 69 0 641 0
state 397 0.98 1 58 0 293 0
zipcode 2422 0.90 1 43 0 388 0
market 0 1.00 7 21 0 3 0
smart_location 0 1.00 6 77 0 706 0
country_code 0 1.00 2 2 0 3 0
country 0 1.00 6 12 0 3 0
property_type 0 1.00 3 22 0 43 0
room_type 0 1.00 10 15 0 4 0
bed_type 0 1.00 5 13 0 5 0
amenities 0 1.00 2 1297 0 21255 0
price 0 1.00 5 10 0 501 0
weekly_price 21993 0.07 6 10 0 581 0
monthly_price 22031 0.07 7 11 0 628 0
security_deposit 15623 0.34 5 10 0 357 0
cleaning_fee 13660 0.42 5 9 0 294 0
extra_people 0 1.00 5 9 0 215 0
calendar_updated 0 1.00 5 14 0 104 0
cancellation_policy 0 1.00 6 27 0 6 0

Variable type: Date

skim_variable n_missing complete_rate min max median n_unique
last_scraped 0 1.00 2020-06-28 2020-06-30 2020-06-29 3
host_since 1 1.00 2009-01-14 2020-06-27 2017-08-27 3095
calendar_last_scraped 0 1.00 2020-06-28 2020-06-30 2020-06-29 3
first_review 12375 0.48 2009-06-01 2020-06-29 2019-04-12 2216
last_review 12375 0.48 2009-06-01 2020-06-29 2020-01-01 1424

Variable type: logical

skim_variable n_missing complete_rate mean count
thumbnail_url 23728 0 NaN :
medium_url 23728 0 NaN :
xl_picture_url 23728 0 NaN :
host_is_superhost 1 1 0.11 FAL: 21147, TRU: 2580
host_has_profile_pic 1 1 1.00 TRU: 23625, FAL: 102
host_identity_verified 1 1 0.16 FAL: 19951, TRU: 3776
neighbourhood_group_cleansed 23728 0 NaN :
is_location_exact 0 1 0.35 FAL: 15329, TRU: 8399
has_availability 0 1 1.00 TRU: 23728
requires_license 0 1 0.00 FAL: 23728
license 23728 0 NaN :
jurisdiction_names 23728 0 NaN :
instant_bookable 0 1 0.60 TRU: 14249, FAL: 9479
is_business_travel_ready 0 1 0.00 FAL: 23728
require_guest_profile_picture 0 1 0.01 FAL: 23571, TRU: 157
require_guest_phone_verification 0 1 0.01 FAL: 23526, TRU: 202

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
id 0 1.00 2.91e+07 1.31e+07 4.83e+03 2.10e+07 3.40e+07 3.97e+07 4.40e+07 ▂▂▂▃▇
scrape_id 0 1.00 2.02e+13 0.00e+00 2.02e+13 2.02e+13 2.02e+13 2.02e+13 2.02e+13 ▁▁▇▁▁
host_id 0 1.00 1.49e+08 1.16e+08 6.60e+03 3.29e+07 1.48e+08 2.59e+08 3.52e+08 ▇▂▃▅▃
host_listings_count 1 1.00 2.43e+01 2.24e+02 0.00e+00 1.00e+00 1.00e+00 4.00e+00 3.77e+03 ▇▁▁▁▁
host_total_listings_count 1 1.00 2.43e+01 2.24e+02 0.00e+00 1.00e+00 1.00e+00 4.00e+00 3.77e+03 ▇▁▁▁▁
latitude 0 1.00 4.10e+01 5.00e-02 4.08e+01 4.10e+01 4.10e+01 4.10e+01 4.15e+01 ▁▇▁▁▁
longitude 0 1.00 2.90e+01 1.30e-01 2.80e+01 2.90e+01 2.90e+01 2.90e+01 2.99e+01 ▁▁▇▁▁
accommodates 0 1.00 3.21e+00 2.25e+00 1.00e+00 2.00e+00 2.00e+00 4.00e+00 1.60e+01 ▇▁▁▁▁
bathrooms 86 1.00 1.21e+00 1.04e+00 0.00e+00 1.00e+00 1.00e+00 1.00e+00 5.00e+01 ▇▁▁▁▁
bedrooms 173 0.99 1.39e+00 1.44e+00 0.00e+00 1.00e+00 1.00e+00 2.00e+00 5.00e+01 ▇▁▁▁▁
beds 698 0.97 2.05e+00 2.04e+00 0.00e+00 1.00e+00 1.00e+00 2.00e+00 7.70e+01 ▇▁▁▁▁
square_feet 23492 0.01 6.05e+02 1.21e+03 0.00e+00 7.00e+01 5.38e+02 8.07e+02 1.74e+04 ▇▁▁▁▁
guests_included 0 1.00 1.40e+00 1.09e+00 1.00e+00 1.00e+00 1.00e+00 1.00e+00 1.60e+01 ▇▁▁▁▁
minimum_nights 0 1.00 4.53e+00 2.76e+01 1.00e+00 1.00e+00 1.00e+00 3.00e+00 1.12e+03 ▇▁▁▁▁
maximum_nights 0 1.00 9.13e+04 1.39e+07 1.00e+00 6.00e+01 1.12e+03 1.12e+03 2.15e+09 ▇▁▁▁▁
minimum_minimum_nights 0 1.00 4.41e+00 2.68e+01 1.00e+00 1.00e+00 1.00e+00 2.00e+00 1.12e+03 ▇▁▁▁▁
maximum_minimum_nights 0 1.00 4.71e+00 2.86e+01 1.00e+00 1.00e+00 1.00e+00 3.00e+00 1.12e+03 ▇▁▁▁▁
minimum_maximum_nights 0 1.00 9.28e+02 9.68e+03 1.00e+00 3.60e+02 1.12e+03 1.12e+03 1.00e+06 ▇▁▁▁▁
maximum_maximum_nights 0 1.00 9.29e+02 9.68e+03 1.00e+00 3.60e+02 1.12e+03 1.12e+03 1.00e+06 ▇▁▁▁▁
minimum_nights_avg_ntm 0 1.00 4.51e+00 2.70e+01 1.00e+00 1.00e+00 1.00e+00 3.00e+00 1.12e+03 ▇▁▁▁▁
maximum_nights_avg_ntm 0 1.00 9.29e+02 9.68e+03 1.00e+00 3.60e+02 1.12e+03 1.12e+03 1.00e+06 ▇▁▁▁▁
availability_30 0 1.00 2.21e+01 1.21e+01 0.00e+00 1.70e+01 2.90e+01 3.00e+01 3.00e+01 ▂▁▁▁▇
availability_60 0 1.00 4.53e+01 2.38e+01 0.00e+00 4.10e+01 5.90e+01 6.00e+01 6.00e+01 ▂▁▁▁▇
availability_90 0 1.00 6.90e+01 3.51e+01 0.00e+00 6.60e+01 8.90e+01 9.00e+01 9.00e+01 ▂▁▁▁▇
availability_365 0 1.00 2.28e+02 1.47e+02 0.00e+00 8.90e+01 3.02e+02 3.65e+02 3.65e+02 ▃▂▂▁▇
number_of_reviews 0 1.00 7.87e+00 2.32e+01 0.00e+00 0.00e+00 0.00e+00 4.00e+00 3.45e+02 ▇▁▁▁▁
number_of_reviews_ltm 0 1.00 3.04e+00 7.47e+00 0.00e+00 0.00e+00 0.00e+00 2.00e+00 1.08e+02 ▇▁▁▁▁
review_scores_rating 12978 0.45 9.13e+01 1.40e+01 2.00e+01 9.00e+01 9.60e+01 1.00e+02 1.00e+02 ▁▁▁▁▇
review_scores_accuracy 12991 0.45 9.29e+00 1.42e+00 2.00e+00 9.00e+00 1.00e+01 1.00e+01 1.00e+01 ▁▁▁▁▇
review_scores_cleanliness 12988 0.45 9.06e+00 1.51e+00 2.00e+00 9.00e+00 1.00e+01 1.00e+01 1.00e+01 ▁▁▁▂▇
review_scores_checkin 12991 0.45 9.52e+00 1.28e+00 2.00e+00 1.00e+01 1.00e+01 1.00e+01 1.00e+01 ▁▁▁▁▇
review_scores_communication 12987 0.45 9.55e+00 1.24e+00 2.00e+00 1.00e+01 1.00e+01 1.00e+01 1.00e+01 ▁▁▁▁▇
review_scores_location 12991 0.45 9.44e+00 1.23e+00 2.00e+00 9.00e+00 1.00e+01 1.00e+01 1.00e+01 ▁▁▁▁▇
review_scores_value 12993 0.45 9.19e+00 1.40e+00 2.00e+00 9.00e+00 1.00e+01 1.00e+01 1.00e+01 ▁▁▁▁▇
calculated_host_listings_count 0 1.00 5.86e+00 1.65e+01 1.00e+00 1.00e+00 2.00e+00 5.00e+00 1.76e+02 ▇▁▁▁▁
calculated_host_listings_count_entire_homes 0 1.00 2.81e+00 6.37e+00 0.00e+00 0.00e+00 1.00e+00 2.00e+00 6.60e+01 ▇▁▁▁▁
calculated_host_listings_count_private_rooms 0 1.00 2.46e+00 1.51e+01 0.00e+00 0.00e+00 1.00e+00 1.00e+00 1.75e+02 ▇▁▁▁▁
calculated_host_listings_count_shared_rooms 0 1.00 9.00e-02 6.00e-01 0.00e+00 0.00e+00 0.00e+00 0.00e+00 1.10e+01 ▇▁▁▁▁
reviews_per_month 12375 0.48 7.10e-01 9.00e-01 1.00e-02 1.30e-01 3.30e-01 9.50e-01 9.20e+00 ▇▁▁▁▁

From our glimpse into the data frame, we see that there are 106 columns with a total of 23,728 rows. However, of these 106 columns, skim() shows us that only 39 are of the type “numeric”. These include variables such as “bedrooms”, “square feet”, “latitude” and “longitude”. Investigating further, we see that a lot of the columns don’t add anything of value in terms of analysis (e.g. id, listing_url, scrape_id) and will therefore be dropped later on.

However, we also see that some of the columns you’d expect to be numeric (e.g. “price”, “cleaning fee”) are actually served as strings - we will therefore need to cast these for use in our analysis later on.

1.2 Categorical variables

Checking for factor variables in our dataset, we first dive into the review section of hosts - surely AirBnB must have implemented a review system of 1-10?

# Bar chart of how many listings received each "value" review score.
# Missing scores are removed with is.na(): comparing against the string "NA"
# only appears to work because a comparison with a real NA yields NA, which
# filter() then discards.
listings %>%
  filter(!is.na(review_scores_value)) %>%
  count(review_scores_value) %>%
  ggplot(aes(x = review_scores_value, y = n)) +
  geom_col()

Manipulating our dataframe to show the range of review scores, we see that AirBnB indeed has a 1-10 rating system, and judging from the scores given it seems that Istanbul has some great hosts! The reason why we don’t see any 1 scores could perhaps be that AirBnB removes hosts who receive such a score very quickly.

Other categorical variables seem to be:

  • maximum_nights (the max period you can rent an AirBnB is 365 days)
  • zipcode (Istanbul has a finite number of zipcodes)
  • neighbourhood (Istanbul has a finite number of neighbourhoods)

1.3 Correlation between variables

A few interesting variables were picked out of the dataset and evaluated using GGPairs:

# Pairwise plots for a handful of candidate predictors of price.
# price is stored as text such as "$720.00", so it is parsed to a number
# first; left as a string, ggpairs() would treat it as a discrete variable
# with hundreds of levels instead of a continuous one.
ggpairs_data <- data.frame(
  price = parse_number(listings$price),
  bedrooms = listings$bedrooms,
  neighbourhood = listings$neighbourhood,
  accommodates = listings$accommodates,
  bathrooms = listings$bathrooms,
  square_feet = listings$square_feet
)

# cardinality_threshold = NULL removes the cap on the number of levels a
# discrete column may have.
ggpairs(ggpairs_data, cardinality_threshold = NULL)

From this plot, we see that the data is scattered all over, suggesting that we may have some trouble building a model that is able to predict the price for a 4 night stay in Istanbul. We see that bathrooms and bedrooms have a near-linear relationship, but that is pretty much the only strong correlation we see. Getting the predicted price might end up being tricky!

2 Cleaning the dataset

2.0.1 Removing unnecessary variables

As we saw in our glimpse of the data set, there are a few variables that have no relevancy, or at least cause a lot of noise when trying to manipulate the data. Let’s remove some of the less important columns:

# Keep relevant variables: drop identifier, URL, host-profile and
# scrape-bookkeeping columns judged irrelevant to the analysis.
listings_cleaned <- listings %>%
  select(!c(
    id, scrape_id, last_scraped, experiences_offered, neighborhood_overview,
    thumbnail_url, medium_url, picture_url, xl_picture_url,
    host_id, host_url, host_name, host_location, host_about,
    host_thumbnail_url, host_picture_url,
    country, country_code,
    calendar_updated, has_availability, calendar_last_scraped,
    jurisdiction_names
  ))

2.0.2 Casting price

Now let’s turn the faulty character variables into the numerics they should be representing, starting with price

# Strip the currency formatting from price and store it as a number.
listings_cleaned <- mutate(listings_cleaned, price = parse_number(price))

# Check the storage type after the conversion.
listings_cleaned %>%
  pull(price) %>%
  typeof()
## [1] "double"

2.0.3 Analyzing cleaning_fee

Now, let’s turn our focus to the cleaning_fee variable

# Convert cleaning_fee from a currency string to a number.
listings_cleaned <- mutate(
  listings_cleaned,
  cleaning_fee = parse_number(cleaning_fee)
)

# Summarise the converted column and render the summary as a styled table.
kable_styling(kable(skim(listings_cleaned$cleaning_fee)))
skim_type skim_variable n_missing complete_rate numeric.mean numeric.sd numeric.p0 numeric.p25 numeric.p50 numeric.p75 numeric.p100 numeric.hist
numeric data 13660 0.424 127 178 0 0 80 192 4569 ▇▁▁▁▁

We see that there are 13660 rows / ads missing a value for cleaning_fee. This most likely indicates that the ad simply has no cleaning fee associated with renting the apartment.

Let’s turn these missing values into zeroes, which more accurately convey what the dataset is trying to tell us:

listings_cleaned <- listings_cleaned %>%
  # A missing fee is treated as "no fee charged": substitute 0 for NA.
  mutate(cleaning_fee = coalesce(cleaning_fee, 0))

# Re-run the summary to check that n_missing is now 0.
kable_styling(kable(skim(listings_cleaned$cleaning_fee)))
skim_type skim_variable n_missing complete_rate numeric.mean numeric.sd numeric.p0 numeric.p25 numeric.p50 numeric.p75 numeric.p100 numeric.hist
numeric data 0 1 54.1 132 0 0 0 60 4569 ▇▁▁▁▁
#Confirmed!

2.0.4 A deeper look into property_type

Let’s turn our attention to property_type. First we count how many categories make up the variable’s frequency

# Number of listings per property type, most frequent first.
# .groups = "keep" leaves the result grouped by property_type.
property_count <- listings_cleaned %>%
  group_by(property_type) %>%
  summarise(n = n(), .groups = "keep") %>%
  arrange(desc(n))

# Render the frequency table.
kable_styling(kable(property_count))
property_type n
Apartment 14958
Serviced apartment 1700
House 1564
Boutique hotel 1113
Townhouse 692
Condominium 629
Aparthotel 590
Bed and breakfast 585
Hotel 545
Loft 436
Villa 328
Hostel 150
Other 78
Casa particular (Cuba) 63
Tiny house 55
Guesthouse 54
Guest suite 32
Earth house 18
Farm stay 17
Yurt 17
Cottage 13
Camper/RV 11
Boat 10
Castle 10
Chalet 9
Nature lodge 8
Treehouse 7
Pension (South Korea) 5
Tent 5
Bungalow 4
Lighthouse 4
Houseboat 3
Campsite 2
Dome house 2
Hut 2
Pousada (Portugal) 2
Barn 1
Cabin 1
Cave 1
Heritage hotel (India) 1
Island 1
Vacation home 1
Windmill 1

We see that the top 4 most common property types are:

  1. Apartment
  2. Serviced apartment
  3. House
  4. Boutique hotel

These four make up…

# Total number of listings, kept as a one-row tibble with column n.
totalproperty_count <- count(listings_cleaned)

# Share of all listings covered by the four most frequent property types.
# property_count is already sorted by descending n, so its first four rows
# are the top four (apt, serviced apt, house, boutique).
property_count %>%
  ungroup() %>%
  slice_head(n = 4) %>%
  summarise(sum4 = sum(n)) %>%
  mutate(proportion4 = sum4 / totalproperty_count$n)
sum4 proportion4
19335 0.815

A whole 81.5% as seen from the data manipulation above. Let’s now create a simplified version of the property_type variable with 5 categories:

  • Apartment
  • Serviced Apartment
  • House
  • Boutique Hotel
  • Other
# Collapse property_type to its four most frequent values plus "Other".
listings_cleaned <- listings_cleaned %>%
  mutate(
    prop_type_simplified = if_else(
      property_type %in% c("Apartment", "Serviced apartment", "House", "Boutique hotel"),
      property_type, # one of the four kept categories: keep the value
      "Other" # everything else is pooled
    )
  )

# Cross-tabulate original against simplified values to verify the mapping.
listings_cleaned %>%
  count(property_type, prop_type_simplified, sort = TRUE) %>%
  kable() %>%
  kable_styling()
property_type prop_type_simplified n
Apartment Apartment 14958
Serviced apartment Serviced apartment 1700
House House 1564
Boutique hotel Boutique hotel 1113
Townhouse Other 692
Condominium Other 629
Aparthotel Other 590
Bed and breakfast Other 585
Hotel Other 545
Loft Other 436
Villa Other 328
Hostel Other 150
Other Other 78
Casa particular (Cuba) Other 63
Tiny house Other 55
Guesthouse Other 54
Guest suite Other 32
Earth house Other 18
Farm stay Other 17
Yurt Other 17
Cottage Other 13
Camper/RV Other 11
Boat Other 10
Castle Other 10
Chalet Other 9
Nature lodge Other 8
Treehouse Other 7
Pension (South Korea) Other 5
Tent Other 5
Bungalow Other 4
Lighthouse Other 4
Houseboat Other 3
Campsite Other 2
Dome house Other 2
Hut Other 2
Pousada (Portugal) Other 2
Barn Other 1
Cabin Other 1
Cave Other 1
Heritage hotel (India) Other 1
Island Other 1
Vacation home Other 1
Windmill Other 1
#Checks out! 

2.0.5 minimum_nights and filtering away non-travellers

Let’s now turn our attention to the minimum_nights variable

# Frequency of each minimum_nights value, most common first.
listings_cleaned %>%
  count(minimum_nights, sort = TRUE) %>%
  kable() %>%
  kable_styling()
minimum_nights n
1 13228
2 4511
3 2682
7 702
5 701
4 490
30 324
10 185
15 179
6 138
14 69
20 61
28 60
90 52
60 42
180 30
365 27
8 26
9 21
13 21
27 19
25 18
29 16
120 15
31 12
21 9
360 9
12 8
100 6
150 6
80 5
1000 5
200 4
11 3
17 3
40 3
45 3
50 3
19 2
65 2
88 2
250 2
500 2
24 1
26 1
32 1
35 1
59 1
61 1
77 1
85 1
96 1
118 1
140 1
148 1
183 1
210 1
300 1
600 1
720 1
730 1
800 1
900 1
999 1
1125 1

The most common values (top 5) are:

  1. 1 night
  2. 2 nights
  3. 3 nights
  4. 7 nights
  5. 5 nights

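Plotting this as a ggplot would result in a very ugly diagram. Note that minimum_nights is the shortest stay a host accepts rather than the length of an actual booking. We see that allowing stays of just 1 night is very popular in Istanbul, representing more than 60% of the listings that use one of the top 5 values.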

There could be several reasons why this is the case, but our hypothesis is that:

  • Tourists jump from one AirBnB to the next as they explore various parts of this vibrant, cultural city e.g. living near the Blue Mosque one day, moving to Taksim Square the next before crossing the Bosphorous Strait into the Asian part of Istanbul.

We will now filter the minimum_nights so that only observations <= 4 are included in the dataset

# Keep only listings whose minimum stay is at most 4 nights.
listings_cleaned <- filter(listings_cleaned, minimum_nights <= 4)

3 Mapping

# Interactive map with one small circle marker per listing; the marker label
# shows the property type and the popup shows the listing URL.
listings_cleaned %>%
  filter(minimum_nights <= 4) %>%
  leaflet() %>%
  addProviderTiles("OpenStreetMap.Mapnik") %>%
  addCircleMarkers(
    lng = ~longitude,
    lat = ~latitude,
    radius = 1,
    fillColor = "blue",
    fillOpacity = 0.4,
    popup = ~listing_url,
    label = ~property_type
  )

4 Regression Analysis

# Convert the extra-guest charge from a currency string to a number.
listings_cleaned <- mutate(
  listings_cleaned,
  extra_people = parse_number(extra_people)
)

# Largest listed price in the filtered data.
listings_cleaned %>%
  pull(price) %>%
  max()
## [1] 76922
# Cost of a 4-night stay for two people.
# Listings that include only one guest charge the extra-person fee for the
# second guest on each of the four nights.
total_cost_1 <- listings_cleaned %>%
  filter(guests_included == 1) %>%
  mutate(cost = 4 * price + 4 * extra_people + cleaning_fee)

# Listings that already include two or more guests need no surcharge.
total_cost_2 <- listings_cleaned %>%
  filter(guests_included >= 2) %>%
  mutate(cost = 4 * price + cleaning_fee)

# The two subsets are disjoint, so stack them with bind_rows(). A full_join()
# without `by` would try to match rows on every shared column, which is slow
# and fragile for what is simply a row-wise concatenation. Rows keep the same
# order: the one-guest subset first, then the rest.
listings_cleaned <- bind_rows(total_cost_1, total_cost_2) %>%
  mutate(price_4_nights = cost)

# Prefer fixed over scientific notation and print 4 significant digits in the
# output that follows.
options(scipen = 100, digits = 4)

# Distribution of the 4-night cost. Each bar counts listings, so the y-axis is
# labelled as such; bins = 30 is geom_histogram()'s default made explicit.
ggplot(listings_cleaned, aes(x = price_4_nights)) +
  geom_histogram(bins = 30) +
  labs(
    title = "Price of staying at an Airbnb location for 4 nights",
    x = "Costs of 4-night stays",
    y = "No. of listings"
  )

# Same distribution on a log10 x-axis to spread out the long right tail.
ggplot(listings_cleaned, aes(x = price_4_nights)) +
  geom_histogram(bins = 30) +
  scale_x_log10() +
  labs(
    title = "Price of staying at an Airbnb location for 4 nights",
    x = "Costs of 4-night stays",
    y = "No. of listings"
  )

For the regression model we would think of the following explanatory variables:

  • Neighborhood
  • Bedrooms
  • Accommodates
  • Square feet
  • Property type

These are based on our own experience when we’re looking for places to stay. We have found that the above variables usually play a big part in determining the price of the listing.

# Fit a linear model of the 4-night cost on the number of reviews, the
# simplified property type and the overall review rating.
model1 <- lm(price_4_nights ~ number_of_reviews + prop_type_simplified + review_scores_rating, data = listings_cleaned)

# Print the coefficient table and fit statistics.
summary(model1)
## 
## Call:
## lm(formula = price_4_nights ~ number_of_reviews + prop_type_simplified + 
##     review_scores_rating, data = listings_cleaned)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
##  -2641  -1087   -636     81 305432 
## 
## Coefficients:
##                                        Estimate Std. Error t value    Pr(>|t|)
## (Intercept)                            2665.804    493.831    5.40 0.000000069
## number_of_reviews                        -0.744      2.238   -0.33       0.740
## prop_type_simplifiedBoutique hotel      452.912    327.261    1.38       0.166
## prop_type_simplifiedHouse               -21.031    322.412   -0.07       0.948
## prop_type_simplifiedOther               246.776    208.227    1.19       0.236
## prop_type_simplifiedServiced apartment  439.449    280.231    1.57       0.117
## review_scores_rating                    -10.772      5.261   -2.05       0.041
##                                           
## (Intercept)                            ***
## number_of_reviews                         
## prop_type_simplifiedBoutique hotel        
## prop_type_simplifiedHouse                 
## prop_type_simplifiedOther                 
## prop_type_simplifiedServiced apartment    
## review_scores_rating                   *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7190 on 9632 degrees of freedom
##   (11272 observations deleted due to missingness)
## Multiple R-squared:  0.0011, Adjusted R-squared:  0.00048 
## F-statistic: 1.77 on 6 and 9632 DF,  p-value: 0.101
#According to model1, if the review score rating of an Airbnb location rises by 1, the price of a couple staying at that location for 4 nights would decrease by 10.77.

#According to model1, with apartments as the reference category, a 4-night stay for a couple in a boutique hotel costs on average 452.91 more than in an apartment, holding the number of reviews and the review score constant. Serviced apartments cost 439.45 more, and listings in the "Other" category (e.g. townhouses, condominiums, aparthotels, bed and breakfasts, and lofts) cost 246.78 more. Houses are the only type of property estimated to be cheaper than apartments, by 21.03. Note that none of these property-type coefficients is statistically significant at the 5% level.

# Same predictors as model1 plus room_type as an additional explanatory variable.
model2 <- lm(price_4_nights ~ number_of_reviews + room_type + prop_type_simplified + review_scores_rating, data = listings_cleaned)
# Print the coefficient table and fit statistics.
summary(model2)
## 
## Call:
## lm(formula = price_4_nights ~ number_of_reviews + room_type + 
##     prop_type_simplified + review_scores_rating, data = listings_cleaned)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
##  -3430   -944   -539     43 304411 
## 
## Coefficients:
##                                        Estimate Std. Error t value     Pr(>|t|)
## (Intercept)                             2863.32     495.28    5.78 0.0000000076
## number_of_reviews                         -3.01       2.26   -1.33        0.184
## room_typeHotel room                      782.09     374.01    2.09        0.037
## room_typePrivate room                   -879.57     165.56   -5.31 0.0000001104
## room_typeShared room                   -1460.05     638.07   -2.29        0.022
## prop_type_simplifiedBoutique hotel       381.81     366.80    1.04        0.298
## prop_type_simplifiedHouse                 48.55     321.98    0.15        0.880
## prop_type_simplifiedOther                276.95     216.60    1.28        0.201
## prop_type_simplifiedServiced apartment   173.57     285.19    0.61        0.543
## review_scores_rating                      -9.34       5.26   -1.78        0.076
##                                           
## (Intercept)                            ***
## number_of_reviews                         
## room_typeHotel room                    *  
## room_typePrivate room                  ***
## room_typeShared room                   *  
## prop_type_simplifiedBoutique hotel        
## prop_type_simplifiedHouse                 
## prop_type_simplifiedOther                 
## prop_type_simplifiedServiced apartment    
## review_scores_rating                   .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7170 on 9629 degrees of freedom
##   (11272 observations deleted due to missingness)
## Multiple R-squared:  0.00567,    Adjusted R-squared:  0.00474 
## F-statistic:  6.1 on 9 and 9629 DF,  p-value: 0.0000000137
# From this new regression model, we see an increase in the adjusted R^2 value, meaning that room_type does indeed seem to be a predictor of the cost for 4 nights. This is also reflected in the t-values of the various room types: every one of them is significant at the 5% level.

5 Further variables/question to explore

In this part, we will further investigate the relationship between several variables to extend our analysis. First, an overview of the relationship we investigate will be given. Then, the relationship will be investigated, and finally, a written conclusion will be presented.

5.1 Can we use the number of bathrooms, bedrooms, beds and the size of the house to predict the price of a four night stay?

To start, we will investigate whether the number of bathrooms, bedrooms, beds or the size of the house (measured here by the number of guests the listing accommodates) are significant predictors of the price for a four-night stay as it was calculated in the previous chapter.

# Single-predictor model: four-night price explained by bathroom count only.
mymodela <- lm(
  formula = price_4_nights ~ bathrooms,
  data = listings_cleaned
)
msummary(mymodela)
##             Estimate Std. Error t value            Pr(>|t|)    
## (Intercept)   1373.6       81.9    16.8 <0.0000000000000002 ***
## bathrooms      545.6       51.1    10.7 <0.0000000000000002 ***
## 
## Residual standard error: 7860 on 20829 degrees of freedom
##   (80 observations deleted due to missingness)
## Multiple R-squared:  0.00545,    Adjusted R-squared:  0.0054 
## F-statistic:  114 on 1 and 20829 DF,  p-value: <0.0000000000000002

The t value of 10.7 tells us that the number of bathrooms is a significant predictor of the price for four nights. Our model estimates that every bathroom adds about 545.6$ to the price of a four-night stay.

# Single-predictor model: four-night price explained by bedroom count only.
mymodelb <- lm(
  formula = price_4_nights ~ bedrooms,
  data = listings_cleaned
)
msummary(mymodelb)
##             Estimate Std. Error t value            Pr(>|t|)    
## (Intercept)   1543.5       74.3   20.77 <0.0000000000000002 ***
## bedrooms       360.2       36.9    9.75 <0.0000000000000002 ***
## 
## Residual standard error: 7880 on 20753 degrees of freedom
##   (156 observations deleted due to missingness)
## Multiple R-squared:  0.00456,    Adjusted R-squared:  0.00451 
## F-statistic: 95.1 on 1 and 20753 DF,  p-value: <0.0000000000000002

The t value of 9.75 tells us that the number of bedrooms is a significant predictor of the price for four nights. Our model estimates that every bedroom adds about 360.2$ to the price of a four-night stay.

# Single-predictor model: four-night price explained by the number of beds only.
mymodelc <- lm(
  formula = price_4_nights ~ beds,
  data = listings_cleaned
)
msummary(mymodelc)
##             Estimate Std. Error t value            Pr(>|t|)    
## (Intercept)   1338.8       74.7    17.9 <0.0000000000000002 ***
## beds           315.1       25.8    12.2 <0.0000000000000002 ***
## 
## Residual standard error: 7600 on 20259 degrees of freedom
##   (650 observations deleted due to missingness)
## Multiple R-squared:  0.0073, Adjusted R-squared:  0.00725 
## F-statistic:  149 on 1 and 20259 DF,  p-value: <0.0000000000000002

The t value of 12.2 tells us that the number of beds is a significant predictor of the price for four nights. Our model estimates that every bed adds about 315.1$ to the price of a four-night stay.

# Single-predictor model: four-night price explained by guest capacity only.
mymodeld <- lm(
  formula = price_4_nights ~ accommodates,
  data = listings_cleaned
)
msummary(mymodeld)
##              Estimate Std. Error t value             Pr(>|t|)    
## (Intercept)     571.0       93.9    6.08         0.0000000012 ***
## accommodates    460.6       24.3   18.95 < 0.0000000000000002 ***
## 
## Residual standard error: 7800 on 20909 degrees of freedom
## Multiple R-squared:  0.0169, Adjusted R-squared:  0.0168 
## F-statistic:  359 on 1 and 20909 DF,  p-value: <0.0000000000000002

The t value of 18.95 tells us that the number of people an AirBnB can accommodate is a significant predictor of the price for four nights. Our model estimates that every person an apartment can accommodate adds about 460.6$ to the price of a four-night stay.

# Joint model: the four size-related predictors entered together.
mymodele <- lm(
  formula = price_4_nights ~ bathrooms + bedrooms + beds + accommodates,
  data = listings_cleaned
)
msummary(mymodele)
##              Estimate Std. Error t value             Pr(>|t|)    
## (Intercept)    409.20      99.56    4.11            0.0000397 ***
## bathrooms      338.56      71.01    4.77            0.0000019 ***
## bedrooms      -120.36      55.80   -2.16                0.031 *  
## beds             8.28      35.41    0.23                0.815    
## accommodates   412.02      31.04   13.27 < 0.0000000000000002 ***
## 
## Residual standard error: 7580 on 20109 degrees of freedom
##   (797 observations deleted due to missingness)
## Multiple R-squared:  0.0176, Adjusted R-squared:  0.0174 
## F-statistic: 90.3 on 4 and 20109 DF,  p-value: <0.0000000000000002

When we run the model with all the variables at the same time, we find at first sight that the number of beds is no longer a significant predictor and that extra bedrooms actually decrease the price of the four-night stay. It is important to note that a multiple regression is easiest to interpret when the explanatory variables are uncorrelated with one another (i.e. there is no multicollinearity). No test is needed to see that this is not the case in our example: the number of people an apartment can accommodate and the number of bedrooms are obviously correlated with the number of beds, and so is the number of bathrooms. It is for this reason that we cannot draw conclusions from the above linear model without making appropriate adaptations.

5.2 Do superhosts command a pricing premium?

Now, let's investigate whether superhosts command a pricing premium after controlling for other variables. The way we control for other (confounding) variables is to include them in our regression model and see whether the variable we want to investigate is still a significant predictor. The problem is that we will never know whether we have accounted for all the variables we need to (any that remain cause what is called residual confounding), but we try our best to get an answer that is as close to correct as possible.

Let's first look at which variables are available in our cleaned dataset:

# Print every column of the cleaned data with its type and first few values.
listings_cleaned %>%
  glimpse()
## Rows: 20,911
## Columns: 87
## $ listing_url                                  <chr> "https://www.airbnb.com/…
## $ name                                         <chr> "↪ Istanbul, Your second…
## $ summary                                      <chr> NA, NA, NA, NA, NA, "Hi!…
## $ space                                        <chr> "There are many interest…
## $ description                                  <chr> "There are many interest…
## $ notes                                        <chr> NA, NA, NA, NA, "Please …
## $ transit                                      <chr> "There are bus stops to …
## $ access                                       <chr> NA, NA, NA, NA, "You wil…
## $ interaction                                  <chr> NA, NA, NA, NA, "The are…
## $ house_rules                                  <chr> "Non smoker or (Email hi…
## $ host_since                                   <date> 2010-05-16, 2010-05-25,…
## $ host_response_time                           <chr> "N/A", "N/A", "N/A", "N/…
## $ host_response_rate                           <chr> "N/A", "N/A", "N/A", "N/…
## $ host_acceptance_rate                         <chr> "N/A", "N/A", "N/A", "N/…
## $ host_is_superhost                            <lgl> FALSE, FALSE, FALSE, FAL…
## $ host_neighbourhood                           <chr> "Beyoglu", "Taksim", "Ka…
## $ host_listings_count                          <dbl> 1, 1, 1, 1, 6, 1, 1, 1, …
## $ host_total_listings_count                    <dbl> 1, 1, 1, 1, 6, 1, 1, 1, …
## $ host_verifications                           <chr> "['email', 'phone', 'fac…
## $ host_has_profile_pic                         <lgl> TRUE, TRUE, FALSE, TRUE,…
## $ host_identity_verified                       <lgl> FALSE, TRUE, FALSE, FALS…
## $ street                                       <chr> "Istanbul, Istanbul, Tur…
## $ neighbourhood                                <chr> "Beyoglu", "Beyoglu", "K…
## $ neighbourhood_cleansed                       <chr> "Beyoglu", "Beyoglu", "B…
## $ neighbourhood_group_cleansed                 <lgl> NA, NA, NA, NA, NA, NA, …
## $ city                                         <chr> "Istanbul", "Istanbul Pr…
## $ state                                        <chr> "Istanbul", NA, NA, NA, …
## $ zipcode                                      <chr> "34445", "34433", "34425…
## $ market                                       <chr> "Istanbul", "Istanbul", …
## $ smart_location                               <chr> "Istanbul, Turkey", "Ist…
## $ latitude                                     <dbl> 41.05, 41.03, 41.03, 41.…
## $ longitude                                    <dbl> 28.95, 28.98, 28.98, 29.…
## $ is_location_exact                            <lgl> FALSE, FALSE, FALSE, FAL…
## $ property_type                                <chr> "Apartment", "Apartment"…
## $ room_type                                    <chr> "Private room", "Private…
## $ accommodates                                 <dbl> 3, 2, 1, 2, 3, 1, 1, 2, …
## $ bathrooms                                    <dbl> 1.0, 1.0, 1.0, NA, 1.0, …
## $ bedrooms                                     <dbl> 1, 1, 1, 1, 1, 1, 1, 1, …
## $ beds                                         <dbl> 1, 1, 1, NA, 2, 1, NA, 1…
## $ bed_type                                     <chr> "Real Bed", "Real Bed", …
## $ amenities                                    <chr> "{TV,Wifi,Kitchen,Breakf…
## $ square_feet                                  <dbl> 0, NA, NA, NA, NA, NA, N…
## $ price                                        <dbl> 343, 768, 473, 514, 514,…
## $ weekly_price                                 <chr> "$2,297.00", "$1,108.00"…
## $ monthly_price                                <chr> NA, "$3,077.00", NA, NA,…
## $ security_deposit                             <chr> NA, NA, NA, NA, "$686.00…
## $ cleaning_fee                                 <dbl> 0, 154, 0, 0, 309, 69, 0…
## $ guests_included                              <dbl> 1, 1, 1, 1, 1, 1, 1, 1, …
## $ extra_people                                 <dbl> 274, 0, 0, 0, 0, 0, 0, 1…
## $ minimum_nights                               <dbl> 3, 1, 1, 1, 3, 3, 1, 2, …
## $ maximum_nights                               <dbl> 15, 2, 730, 730, 3, 30, …
## $ minimum_minimum_nights                       <dbl> 3, 1, 1, 1, 3, 3, 1, 2, …
## $ maximum_minimum_nights                       <dbl> 3, 1, 1, 1, 3, 3, 1, 2, …
## $ minimum_maximum_nights                       <dbl> 15, 2, 730, 730, 3, 30, …
## $ maximum_maximum_nights                       <dbl> 15, 2, 730, 730, 3, 30, …
## $ minimum_nights_avg_ntm                       <dbl> 3, 1, 1, 1, 3, 3, 1, 2, …
## $ maximum_nights_avg_ntm                       <dbl> 15, 2, 730, 730, 3, 30, …
## $ availability_30                              <dbl> 29, 30, 30, 0, 25, 30, 3…
## $ availability_60                              <dbl> 59, 60, 60, 0, 55, 60, 6…
## $ availability_90                              <dbl> 89, 90, 90, 0, 85, 90, 9…
## $ availability_365                             <dbl> 364, 365, 365, 0, 360, 3…
## $ number_of_reviews                            <dbl> 0, 1, 0, 0, 9, 0, 0, 0, …
## $ number_of_reviews_ltm                        <dbl> 0, 0, 0, 0, 1, 0, 0, 0, …
## $ first_review                                 <date> NA, 2010-06-14, NA, NA,…
## $ last_review                                  <date> NA, 2010-06-14, NA, NA,…
## $ review_scores_rating                         <dbl> NA, 80, NA, NA, 93, NA, …
## $ review_scores_accuracy                       <dbl> NA, NA, NA, NA, 9, NA, N…
## $ review_scores_cleanliness                    <dbl> NA, NA, NA, NA, 9, NA, N…
## $ review_scores_checkin                        <dbl> NA, NA, NA, NA, 9, NA, N…
## $ review_scores_communication                  <dbl> NA, NA, NA, NA, 9, NA, N…
## $ review_scores_location                       <dbl> NA, NA, NA, NA, 8, NA, N…
## $ review_scores_value                          <dbl> NA, NA, NA, NA, 8, NA, N…
## $ requires_license                             <lgl> FALSE, FALSE, FALSE, FAL…
## $ license                                      <lgl> NA, NA, NA, NA, NA, NA, …
## $ instant_bookable                             <lgl> FALSE, TRUE, FALSE, FALS…
## $ is_business_travel_ready                     <lgl> FALSE, FALSE, FALSE, FAL…
## $ cancellation_policy                          <chr> "strict_14_with_grace_pe…
## $ require_guest_profile_picture                <lgl> FALSE, TRUE, FALSE, FALS…
## $ require_guest_phone_verification             <lgl> FALSE, FALSE, FALSE, FAL…
## $ calculated_host_listings_count               <dbl> 1, 1, 1, 1, 6, 1, 1, 1, …
## $ calculated_host_listings_count_entire_homes  <dbl> 0, 0, 0, 0, 4, 0, 0, 1, …
## $ calculated_host_listings_count_private_rooms <dbl> 1, 1, 1, 1, 1, 1, 1, 0, …
## $ calculated_host_listings_count_shared_rooms  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ reviews_per_month                            <dbl> NA, 0.01, NA, NA, 0.08, …
## $ prop_type_simplified                         <chr> "Apartment", "Apartment"…
## $ cost                                         <dbl> 2468, 3226, 1892, 2056, …
## $ price_4_nights                               <dbl> 2468, 3226, 1892, 2056, …
# Summarise every column of the cleaned data, grouped by column type
# (missing counts, completeness, and type-specific statistics).
skim(listings_cleaned) 
Data summary
Name listings_cleaned
Number of rows 20911
Number of columns 87
_______________________
Column type frequency:
character 32
Date 3
logical 11
numeric 41
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
listing_url 0 1.00 33 37 0 20911 0
name 51 1.00 1 77 0 19987 0
summary 3320 0.84 1 1000 0 15014 1
space 10102 0.52 1 1000 0 9159 0
description 2529 0.88 1 1000 0 16566 0
notes 16364 0.22 1 1000 0 3647 0
transit 12093 0.42 1 1000 0 7072 0
access 14476 0.31 1 1000 0 5142 0
interaction 13267 0.37 1 1000 0 5777 0
house_rules 14607 0.30 1 1000 0 5453 0
host_response_time 1 1.00 3 18 0 5 0
host_response_rate 1 1.00 2 4 0 57 0
host_acceptance_rate 1 1.00 2 4 0 85 0
host_neighbourhood 12984 0.38 4 33 0 49 0
host_verifications 0 1.00 2 158 0 261 0
street 0 1.00 10 116 0 1050 0
neighbourhood 4441 0.79 4 15 0 15 0
neighbourhood_cleansed 0 1.00 4 13 0 39 0
city 684 0.97 2 69 0 574 0
state 337 0.98 1 58 0 265 0
zipcode 2102 0.90 1 43 0 379 0
market 0 1.00 7 21 0 3 0
smart_location 0 1.00 6 77 0 628 0
property_type 0 1.00 3 22 0 40 0
room_type 0 1.00 10 15 0 4 0
bed_type 0 1.00 5 13 0 5 0
amenities 0 1.00 2 1297 0 18656 0
weekly_price 19464 0.07 6 10 0 523 0
monthly_price 19547 0.07 7 11 0 533 0
security_deposit 13985 0.33 5 10 0 298 0
cancellation_policy 0 1.00 6 27 0 6 0
prop_type_simplified 0 1.00 5 18 0 5 0

Variable type: Date

skim_variable n_missing complete_rate min max median n_unique
host_since 1 1.00 2009-01-14 2020-06-27 2017-09-05 3002
first_review 10763 0.49 2009-06-01 2020-06-29 2019-04-15 2124
last_review 10763 0.49 2009-06-01 2020-06-29 2020-01-02 1340

Variable type: logical

skim_variable n_missing complete_rate mean count
host_is_superhost 1 1 0.11 FAL: 18697, TRU: 2213
host_has_profile_pic 1 1 1.00 TRU: 20822, FAL: 88
host_identity_verified 1 1 0.16 FAL: 17598, TRU: 3312
neighbourhood_group_cleansed 20911 0 NaN :
is_location_exact 0 1 0.36 FAL: 13316, TRU: 7595
requires_license 0 1 0.00 FAL: 20911
license 20911 0 NaN :
instant_bookable 0 1 0.61 TRU: 12714, FAL: 8197
is_business_travel_ready 0 1 0.00 FAL: 20911
require_guest_profile_picture 0 1 0.01 FAL: 20784, TRU: 127
require_guest_phone_verification 0 1 0.01 FAL: 20740, TRU: 171

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
host_listings_count 1 1.00 27.08 238.43 0.00 1.00 1.00 5.00 3767.00 ▇▁▁▁▁
host_total_listings_count 1 1.00 27.08 238.43 0.00 1.00 1.00 5.00 3767.00 ▇▁▁▁▁
latitude 0 1.00 41.03 0.04 40.81 41.01 41.03 41.05 41.48 ▁▇▁▁▁
longitude 0 1.00 28.98 0.12 28.02 28.97 28.98 29.02 29.91 ▁▁▇▁▁
accommodates 0 1.00 3.16 2.22 1.00 2.00 2.00 4.00 16.00 ▇▁▁▁▁
bathrooms 80 1.00 1.20 1.07 0.00 1.00 1.00 1.00 50.00 ▇▁▁▁▁
bedrooms 156 0.99 1.36 1.48 0.00 1.00 1.00 2.00 50.00 ▇▁▁▁▁
beds 650 0.97 2.02 2.07 0.00 1.00 1.00 2.00 77.00 ▇▁▁▁▁
square_feet 20711 0.01 622.07 1288.95 0.00 95.25 549.00 807.00 17384.00 ▇▁▁▁▁
price 0 1.00 480.94 1965.88 0.00 137.00 247.00 446.00 76922.00 ▇▁▁▁▁
cleaning_fee 0 1.00 48.22 120.08 0.00 0.00 0.00 48.00 4241.00 ▇▁▁▁▁
guests_included 0 1.00 1.40 1.09 1.00 1.00 1.00 1.00 16.00 ▇▁▁▁▁
extra_people 0 1.00 30.63 75.70 0.00 0.00 0.00 40.00 2057.00 ▇▁▁▁▁
minimum_nights 0 1.00 1.54 0.80 1.00 1.00 1.00 2.00 4.00 ▇▃▁▂▁
maximum_nights 0 1.00 103508.86 14850543.83 1.00 59.50 1125.00 1125.00 2147483647.00 ▇▁▁▁▁
minimum_minimum_nights 0 1.00 1.54 0.86 1.00 1.00 1.00 2.00 28.00 ▇▁▁▁▁
maximum_minimum_nights 0 1.00 1.67 4.17 1.00 1.00 1.00 2.00 365.00 ▇▁▁▁▁
minimum_maximum_nights 0 1.00 898.92 7652.64 1.00 360.00 1125.00 1125.00 999999.00 ▇▁▁▁▁
maximum_maximum_nights 0 1.00 899.60 7652.62 1.00 360.00 1125.00 1125.00 999999.00 ▇▁▁▁▁
minimum_nights_avg_ntm 0 1.00 1.58 1.96 1.00 1.00 1.00 2.00 218.90 ▇▁▁▁▁
maximum_nights_avg_ntm 0 1.00 899.40 7652.63 1.00 360.00 1125.00 1125.00 999999.00 ▇▁▁▁▁
availability_30 0 1.00 22.31 12.04 0.00 20.00 29.00 30.00 30.00 ▂▁▁▁▇
availability_60 0 1.00 45.69 23.65 0.00 44.00 59.00 60.00 60.00 ▂▁▁▁▇
availability_90 0 1.00 69.46 34.97 0.00 70.00 89.00 90.00 90.00 ▂▁▁▁▇
availability_365 0 1.00 229.33 146.39 0.00 89.00 319.00 365.00 365.00 ▃▂▂▁▇
number_of_reviews 0 1.00 8.24 24.05 0.00 0.00 0.00 4.00 345.00 ▇▁▁▁▁
number_of_reviews_ltm 0 1.00 3.20 7.72 0.00 0.00 0.00 2.00 108.00 ▇▁▁▁▁
review_scores_rating 11272 0.46 91.17 14.07 20.00 89.00 96.00 100.00 100.00 ▁▁▁▁▇
review_scores_accuracy 11284 0.46 9.28 1.42 2.00 9.00 10.00 10.00 10.00 ▁▁▁▁▇
review_scores_cleanliness 11282 0.46 9.05 1.50 2.00 9.00 10.00 10.00 10.00 ▁▁▁▂▇
review_scores_checkin 11285 0.46 9.52 1.28 2.00 10.00 10.00 10.00 10.00 ▁▁▁▁▇
review_scores_communication 11281 0.46 9.54 1.24 2.00 10.00 10.00 10.00 10.00 ▁▁▁▁▇
review_scores_location 11285 0.46 9.44 1.23 2.00 9.00 10.00 10.00 10.00 ▁▁▁▁▇
review_scores_value 11287 0.46 9.17 1.41 2.00 9.00 10.00 10.00 10.00 ▁▁▁▁▇
calculated_host_listings_count 0 1.00 6.13 17.42 1.00 1.00 2.00 5.00 176.00 ▇▁▁▁▁
calculated_host_listings_count_entire_homes 0 1.00 2.78 6.41 0.00 0.00 1.00 2.00 66.00 ▇▁▁▁▁
calculated_host_listings_count_private_rooms 0 1.00 2.71 16.02 0.00 0.00 1.00 1.00 175.00 ▇▁▁▁▁
calculated_host_listings_count_shared_rooms 0 1.00 0.10 0.64 0.00 0.00 0.00 0.00 11.00 ▇▁▁▁▁
reviews_per_month 10763 0.49 0.73 0.92 0.01 0.13 0.34 0.99 9.20 ▇▁▁▁▁
cost 0 1.00 2026.44 7871.02 0.00 604.00 1068.00 1864.00 307688.00 ▇▁▁▁▁
price_4_nights 0 1.00 2026.44 7871.02 0.00 604.00 1068.00 1864.00 307688.00 ▇▁▁▁▁

We assume that controlling for all other variables that characterise the host will allow us to single out the effect of being (or not being) a superhost.

First, let's create a new variable so that we can adjust for the number of host verifications:

# Add the number of verification methods each host lists.
#
# `host_verifications` holds one string per listing, shaped like
# "['email', 'phone', ...]". Calling `length()` on the column inside `mutate()`
# returns the length of the whole vector, i.e. the same constant for every
# row, which carries no information as a regression predictor. Instead, count
# the single-quoted entries within each string; a string with no quoted entry
# (e.g. "[]") counts as 0.
listings_cleaned_f <- listings_cleaned %>%
  mutate(
    number_of_verifications = lengths(
      regmatches(
        host_verifications,
        gregexpr("'[^']*'", host_verifications)
      )
    )
  )

Now, we use the following linear model:

# Regress `price` on superhost status together with the other host-level
# characteristics that serve as controls.
mymodelf <- lm(
  formula = price ~ host_is_superhost + host_total_listings_count +
    host_has_profile_pic + host_identity_verified + number_of_verifications,
  data = listings_cleaned_f
)
msummary(mymodelf)
##                             Estimate Std. Error t value            Pr(>|t|)    
## (Intercept)                 700.1891   203.9621    3.43              0.0006 ***
## host_is_superhostTRUE        95.6749    43.4600    2.20              0.0277 *  
## host_total_listings_count     1.8947     0.0556   34.11 <0.0000000000000002 ***
## host_has_profile_picTRUE   -274.1047   204.4635   -1.34              0.1801    
## host_identity_verifiedTRUE  -48.6547    36.6238   -1.33              0.1840    
## 
## Residual standard error: 1910 on 20905 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.0531, Adjusted R-squared:  0.0529 
## F-statistic:  293 on 4 and 20905 DF,  p-value: <0.0000000000000002

As you can see, we did not take the response time, response rate or neighbourhood of the host into account, since these variables are only known for a very limited number of hosts. We find that superhosts command a price premium, as superhost status remains a significant predictor even after controlling for the previously mentioned variables: it has a t value of 2.2 and a Pr(>|t|) of 0.0277. Do note that this model only explains a very small part of the price, as the adjusted R-squared is only 0.0529.

5.3 Can the fact that a listing has an exact location be used to predict the price of a four night stay?

Let's investigate whether the fact that a property has or has not listed its exact location is a significant predictor of the price of our four-night stay. Using a similar strategy as above, we first looked at the available variables. We suspect we will have to control for how complete a listing's information is in general, so that we can isolate the effect of specifically having the exact location available. In the following code, we create a variable that is 0 if no summary is available and 1 if a summary is available. We do the same thing for space, description, notes, transit, access and interaction.

# Add one 0/1 indicator per free-text field: 1 when the field is filled in,
# 0 when it is NA. Negating `is.na()` and converting to numeric yields the
# same double-valued flags as an `ifelse(..., 0, 1)` lookup.
listings_cleaned_g <- listings_cleaned %>%
  mutate(
    summary_available = as.numeric(!is.na(summary)),
    space_available = as.numeric(!is.na(space)),
    description_available = as.numeric(!is.na(description)),
    notes_available = as.numeric(!is.na(notes)),
    transit_available = as.numeric(!is.na(transit)),
    access_available = as.numeric(!is.na(access)),
    interaction_available = as.numeric(!is.na(interaction))
  )

# Inspect the result; the seven new indicator columns are appended at the end.
glimpse(listings_cleaned_g)
## Rows: 20,911
## Columns: 94
## $ listing_url                                  <chr> "https://www.airbnb.com/…
## $ name                                         <chr> "↪ Istanbul, Your second…
## $ summary                                      <chr> NA, NA, NA, NA, NA, "Hi!…
## $ space                                        <chr> "There are many interest…
## $ description                                  <chr> "There are many interest…
## $ notes                                        <chr> NA, NA, NA, NA, "Please …
## $ transit                                      <chr> "There are bus stops to …
## $ access                                       <chr> NA, NA, NA, NA, "You wil…
## $ interaction                                  <chr> NA, NA, NA, NA, "The are…
## $ house_rules                                  <chr> "Non smoker or (Email hi…
## $ host_since                                   <date> 2010-05-16, 2010-05-25,…
## $ host_response_time                           <chr> "N/A", "N/A", "N/A", "N/…
## $ host_response_rate                           <chr> "N/A", "N/A", "N/A", "N/…
## $ host_acceptance_rate                         <chr> "N/A", "N/A", "N/A", "N/…
## $ host_is_superhost                            <lgl> FALSE, FALSE, FALSE, FAL…
## $ host_neighbourhood                           <chr> "Beyoglu", "Taksim", "Ka…
## $ host_listings_count                          <dbl> 1, 1, 1, 1, 6, 1, 1, 1, …
## $ host_total_listings_count                    <dbl> 1, 1, 1, 1, 6, 1, 1, 1, …
## $ host_verifications                           <chr> "['email', 'phone', 'fac…
## $ host_has_profile_pic                         <lgl> TRUE, TRUE, FALSE, TRUE,…
## $ host_identity_verified                       <lgl> FALSE, TRUE, FALSE, FALS…
## $ street                                       <chr> "Istanbul, Istanbul, Tur…
## $ neighbourhood                                <chr> "Beyoglu", "Beyoglu", "K…
## $ neighbourhood_cleansed                       <chr> "Beyoglu", "Beyoglu", "B…
## $ neighbourhood_group_cleansed                 <lgl> NA, NA, NA, NA, NA, NA, …
## $ city                                         <chr> "Istanbul", "Istanbul Pr…
## $ state                                        <chr> "Istanbul", NA, NA, NA, …
## $ zipcode                                      <chr> "34445", "34433", "34425…
## $ market                                       <chr> "Istanbul", "Istanbul", …
## $ smart_location                               <chr> "Istanbul, Turkey", "Ist…
## $ latitude                                     <dbl> 41.05, 41.03, 41.03, 41.…
## $ longitude                                    <dbl> 28.95, 28.98, 28.98, 29.…
## $ is_location_exact                            <lgl> FALSE, FALSE, FALSE, FAL…
## $ property_type                                <chr> "Apartment", "Apartment"…
## $ room_type                                    <chr> "Private room", "Private…
## $ accommodates                                 <dbl> 3, 2, 1, 2, 3, 1, 1, 2, …
## $ bathrooms                                    <dbl> 1.0, 1.0, 1.0, NA, 1.0, …
## $ bedrooms                                     <dbl> 1, 1, 1, 1, 1, 1, 1, 1, …
## $ beds                                         <dbl> 1, 1, 1, NA, 2, 1, NA, 1…
## $ bed_type                                     <chr> "Real Bed", "Real Bed", …
## $ amenities                                    <chr> "{TV,Wifi,Kitchen,Breakf…
## $ square_feet                                  <dbl> 0, NA, NA, NA, NA, NA, N…
## $ price                                        <dbl> 343, 768, 473, 514, 514,…
## $ weekly_price                                 <chr> "$2,297.00", "$1,108.00"…
## $ monthly_price                                <chr> NA, "$3,077.00", NA, NA,…
## $ security_deposit                             <chr> NA, NA, NA, NA, "$686.00…
## $ cleaning_fee                                 <dbl> 0, 154, 0, 0, 309, 69, 0…
## $ guests_included                              <dbl> 1, 1, 1, 1, 1, 1, 1, 1, …
## $ extra_people                                 <dbl> 274, 0, 0, 0, 0, 0, 0, 1…
## $ minimum_nights                               <dbl> 3, 1, 1, 1, 3, 3, 1, 2, …
## $ maximum_nights                               <dbl> 15, 2, 730, 730, 3, 30, …
## $ minimum_minimum_nights                       <dbl> 3, 1, 1, 1, 3, 3, 1, 2, …
## $ maximum_minimum_nights                       <dbl> 3, 1, 1, 1, 3, 3, 1, 2, …
## $ minimum_maximum_nights                       <dbl> 15, 2, 730, 730, 3, 30, …
## $ maximum_maximum_nights                       <dbl> 15, 2, 730, 730, 3, 30, …
## $ minimum_nights_avg_ntm                       <dbl> 3, 1, 1, 1, 3, 3, 1, 2, …
## $ maximum_nights_avg_ntm                       <dbl> 15, 2, 730, 730, 3, 30, …
## $ availability_30                              <dbl> 29, 30, 30, 0, 25, 30, 3…
## $ availability_60                              <dbl> 59, 60, 60, 0, 55, 60, 6…
## $ availability_90                              <dbl> 89, 90, 90, 0, 85, 90, 9…
## $ availability_365                             <dbl> 364, 365, 365, 0, 360, 3…
## $ number_of_reviews                            <dbl> 0, 1, 0, 0, 9, 0, 0, 0, …
## $ number_of_reviews_ltm                        <dbl> 0, 0, 0, 0, 1, 0, 0, 0, …
## $ first_review                                 <date> NA, 2010-06-14, NA, NA,…
## $ last_review                                  <date> NA, 2010-06-14, NA, NA,…
## $ review_scores_rating                         <dbl> NA, 80, NA, NA, 93, NA, …
## $ review_scores_accuracy                       <dbl> NA, NA, NA, NA, 9, NA, N…
## $ review_scores_cleanliness                    <dbl> NA, NA, NA, NA, 9, NA, N…
## $ review_scores_checkin                        <dbl> NA, NA, NA, NA, 9, NA, N…
## $ review_scores_communication                  <dbl> NA, NA, NA, NA, 9, NA, N…
## $ review_scores_location                       <dbl> NA, NA, NA, NA, 8, NA, N…
## $ review_scores_value                          <dbl> NA, NA, NA, NA, 8, NA, N…
## $ requires_license                             <lgl> FALSE, FALSE, FALSE, FAL…
## $ license                                      <lgl> NA, NA, NA, NA, NA, NA, …
## $ instant_bookable                             <lgl> FALSE, TRUE, FALSE, FALS…
## $ is_business_travel_ready                     <lgl> FALSE, FALSE, FALSE, FAL…
## $ cancellation_policy                          <chr> "strict_14_with_grace_pe…
## $ require_guest_profile_picture                <lgl> FALSE, TRUE, FALSE, FALS…
## $ require_guest_phone_verification             <lgl> FALSE, FALSE, FALSE, FAL…
## $ calculated_host_listings_count               <dbl> 1, 1, 1, 1, 6, 1, 1, 1, …
## $ calculated_host_listings_count_entire_homes  <dbl> 0, 0, 0, 0, 4, 0, 0, 1, …
## $ calculated_host_listings_count_private_rooms <dbl> 1, 1, 1, 1, 1, 1, 1, 0, …
## $ calculated_host_listings_count_shared_rooms  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ reviews_per_month                            <dbl> NA, 0.01, NA, NA, 0.08, …
## $ prop_type_simplified                         <chr> "Apartment", "Apartment"…
## $ cost                                         <dbl> 2468, 3226, 1892, 2056, …
## $ price_4_nights                               <dbl> 2468, 3226, 1892, 2056, …
## $ summary_available                            <dbl> 0, 0, 0, 0, 0, 1, 0, 0, …
## $ space_available                              <dbl> 1, 1, 1, 1, 1, 0, 1, 1, …
## $ description_available                        <dbl> 1, 1, 1, 1, 1, 1, 1, 1, …
## $ notes_available                              <dbl> 0, 0, 0, 0, 1, 0, 0, 0, …
## $ transit_available                            <dbl> 1, 0, 0, 0, 1, 0, 0, 0, …
## $ access_available                             <dbl> 0, 0, 0, 0, 1, 0, 0, 0, …
## $ interaction_available                        <dbl> 0, 0, 0, 0, 1, 0, 0, 0, …

After glancing at our new variables, we see that there is a very strong correlation between them but this should not be a problem when we only want to test for the significance of the exact location and use these variables to control for other factors.

# Four-night price regressed on the exact-location flag, with the
# text-availability indicators as controls.
# NOTE(review): `space_available` is created alongside the other indicators
# but is not part of this formula; confirm the omission is intentional.
mymodelg <- lm(
  formula = price_4_nights ~ is_location_exact + summary_available +
    description_available + notes_available + transit_available +
    access_available + interaction_available,
  data = listings_cleaned_g
)
msummary(mymodelg)
##                       Estimate Std. Error t value             Pr(>|t|)    
## (Intercept)             2041.5      158.9   12.85 < 0.0000000000000002 ***
## is_location_exactTRUE    458.5      114.0    4.02             0.000058 ***
## summary_available         79.7      287.9    0.28              0.78193    
## description_available    105.2      322.4    0.33              0.74431    
## notes_available           91.9      160.7    0.57              0.56744    
## transit_available       -357.7      148.6   -2.41              0.01605 *  
## access_available         -79.3      156.8   -0.51              0.61321    
## interaction_available   -508.2      153.8   -3.30              0.00096 ***
## 
## Residual standard error: 7860 on 20903 degrees of freedom
## Multiple R-squared:  0.00293,    Adjusted R-squared:  0.0026 
## F-statistic: 8.78 on 7 and 20903 DF,  p-value: 0.0000000000813

We conclude that the availability of an exact location for the listing is a significant predictor of the price of our four-night stay, since the t value is 4.02. Again, do note that this model only explains a very small part of the price, as the adjusted R-squared is only 0.0026.

5.4 Is location a predictor of the price of our four night stay?

For this question, we will start by grouping the neighbourhoods to divide them into 6 geographical areas: Center (C), North (N), East (E), Far East (FE), West (W), Far West (FW).

# Collapse the individual neighbourhoods into six broader areas
# (Center, North, East, Far East, West, Far West).
# Any neighbourhood_cleansed value that is not listed below (including NA)
# matches no branch and therefore gets NA in neighbourhood_simplified.
listings_cleaned_g <- listings_cleaned %>%
  mutate(
    neighbourhood_simplified = case_when(
      neighbourhood_cleansed %in% c(
        "Besiktas", "Beyoglu", "Fatih", "Sisli", "Uskudar"
      ) ~ "Center",
      neighbourhood_cleansed %in% c(
        "Arnavutkoy", "Beykoz", "Eyup", "Kagithane", "Sariyer"
      ) ~ "North",
      neighbourhood_cleansed %in% c(
        "Atasehir", "Kadikoy", "Maltepe", "Umraniye"
      ) ~ "East",
      neighbourhood_cleansed %in% c(
        "Cekmekoy", "Kartal", "Pendik", "Sancaktepe", "Sile",
        "Sultanbeyli", "Tuzla"
      ) ~ "Far East",
      neighbourhood_cleansed %in% c(
        "Bagcilar", "Bahcelievler", "Bakirkoy", "Bayrampasa", "Esenler",
        "Gaziosmanpasa", "Gungoren", "Zeytinburnu"
      ) ~ "West",
      neighbourhood_cleansed %in% c(
        "Avcilar", "Basaksehir", "Beylikduzu", "Buyukcekmece", "Catalca",
        "Esenyurt", "Kucukcekmece", "Silivri", "Sultangazi"
      ) ~ "Far West"
    )
  )

Now, let's create a model that will show us whether the areas we just created are significant predictors of the price of the four night stay. In this model, the estimates will be calculated as the difference between the price in the center and the price in that region. E.g. if the estimate for West were -500, it would mean the price is estimated to be $500 lower in the West compared to the center of the city.

# Regress the four-night price on the simplified area and print the fit.
# In the summary, "Center" has no coefficient of its own: it is the baseline
# captured by the intercept, and the other areas are reported relative to it.
mymodelg <- lm(price_4_nights ~ neighbourhood_simplified, data = listings_cleaned_g)
msummary(mymodelg)
##                                  Estimate Std. Error t value
## (Intercept)                        2255.7       67.3   33.54
## neighbourhood_simplifiedEast      -1169.7      162.3   -7.21
## neighbourhood_simplifiedFar East   -667.2      304.8   -2.19
## neighbourhood_simplifiedFar West   -141.5      225.9   -0.63
## neighbourhood_simplifiedNorth      -130.2      233.8   -0.56
## neighbourhood_simplifiedWest       -718.0      275.1   -2.61
##                                              Pr(>|t|)    
## (Intercept)                      < 0.0000000000000002 ***
## neighbourhood_simplifiedEast         0.00000000000059 ***
## neighbourhood_simplifiedFar East               0.0286 *  
## neighbourhood_simplifiedFar West               0.5309    
## neighbourhood_simplifiedNorth                  0.5776    
## neighbourhood_simplifiedWest                   0.0091 ** 
## 
## Residual standard error: 7890 on 20758 degrees of freedom
##   (147 observations deleted due to missingness)
## Multiple R-squared:  0.00279,    Adjusted R-squared:  0.00255 
## F-statistic: 11.6 on 5 and 20758 DF,  p-value: 0.0000000000317

We observe a p-value for our model of 3.17e-11. This means that the area to which a property belongs is indeed a significant predictor of the price of our four night stay. As we suspected, the properties in the center of the city are the most expensive for us to stay at for four nights, although the differences for the Far West and the North are not statistically significant.

5.5 What is the effect of the cancellation policy on the price of our four night stay?

For this question, we will investigate whether the cancellation policy has an effect on the price of our four night stay. Using a similar strategy as for the previous questions, we first looked at the available variables.

# Four-night price against cancellation policy, with bedrooms, bathrooms and
# accommodates included as additional explanatory variables; print the fit.
mymodeli <- lm(price_4_nights ~ cancellation_policy + bedrooms +  bathrooms + accommodates, data = listings_cleaned)
summary(mymodeli) 
## 
## Call:
## lm(formula = price_4_nights ~ cancellation_policy + bedrooms + 
##     bathrooms + accommodates, data = listings_cleaned)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -16835  -1124   -693   -114 306202 
## 
## Coefficients:
##                                                Estimate Std. Error t value
## (Intercept)                                       418.0      105.6    3.96
## cancellation_policymoderate                      -307.0      145.9   -2.10
## cancellation_policystrict                        -342.3     7840.4   -0.04
## cancellation_policystrict_14_with_grace_period    425.2      147.7    2.88
## cancellation_policysuper_strict_30               1072.2     2176.0    0.49
## cancellation_policysuper_strict_60                818.2     7840.4    0.10
## bedrooms                                         -135.8       55.6   -2.44
## bathrooms                                         334.9       72.2    4.64
## accommodates                                      434.3       28.8   15.11
##                                                            Pr(>|t|)    
## (Intercept)                                               0.0000757 ***
## cancellation_policymoderate                                   0.035 *  
## cancellation_policystrict                                     0.965    
## cancellation_policystrict_14_with_grace_period                0.004 ** 
## cancellation_policysuper_strict_30                            0.622    
## cancellation_policysuper_strict_60                            0.917    
## bedrooms                                                      0.015 *  
## bathrooms                                                 0.0000036 ***
## accommodates                                   < 0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7840 on 20677 degrees of freedom
##   (225 observations deleted due to missingness)
## Multiple R-squared:  0.0186, Adjusted R-squared:  0.0182 
## F-statistic:   49 on 8 and 20677 DF,  p-value: <0.0000000000000002

We see that cancellation_policy indeed is a statistically significant explanatory variable for the price for a 4-night stay in Istanbul, although the direction of the effect depends on the policy. If a listing has a moderate cancellation policy, the price for four nights is lower by about $307 compared to the baseline policy (p = 0.035).

If a listing has a strict policy with a 14 days grace period, the price for four nights is higher by about $425 (p = 0.004). The estimate of -$342 belongs to the plain strict policy and is not statistically significant (p = 0.965).

5.6 Diagnostics, collinearity, summary tables

5.6.1 Checking residuals of all the models

# Draw the autoplot() diagnostics for each fitted model in turn, one call per
# model so that every model gets its own set of plots in the report.
autoplot(model1)

autoplot(model2)

autoplot(mymodela)

autoplot(mymodelb)

autoplot(mymodelc)

autoplot(mymodeld)

autoplot(mymodele)

autoplot(mymodelf)

autoplot(mymodelg)

autoplot(mymodeli)

5.6.2 Finding the Variance Inflation Factor

5.6.2.1 Model 1

# Variance inflation factors for model1, rendered as a styled table.
vif(model1) %>%
  kable() %>%
  kable_styling()
GVIF Df GVIF^(1/(2*Df))
number_of_reviews 1.014 1 1.007
prop_type_simplified 1.025 4 1.003
review_scores_rating 1.023 1 1.012

Model 1 looks OK regarding VIF-scores.

5.6.2.2 Model 2

# Variance inflation factors for model2, rendered as a styled table.
vif(model2) %>%
  kable() %>%
  kable_styling()
GVIF Df GVIF^(1/(2*Df))
number_of_reviews 1.039 1 1.020
room_type 1.401 3 1.058
prop_type_simplified 1.390 4 1.042
review_scores_rating 1.026 1 1.013

Model 2 looks OK regarding VIF-scores.

5.6.2.3 Model A - D

Only one explanatory variable for these models hence no need to check for multicollinearity.

5.6.2.4 Model E

# Variance inflation factors for mymodele, rendered as a styled table.
vif(mymodele) %>%
  kable() %>%
  kable_styling()
x
bathrooms 2.065
bedrooms 2.431
beds 1.884
accommodates 1.685

Again a rather low VIF-score across the board.

5.6.2.5 Model F

#vif(mymodelf) %>%
#  kable() %>%
#  kable_styling()

VIF throws an error here saying there’s at least one aliased coefficient in the model, meaning that some of the predictors are perfectly collinear. Let’s find out which ones those are:

# List the aliased (linearly dependent) terms in mymodelf.
alias(mymodelf) 
## Model :
## price ~ host_is_superhost + host_total_listings_count + host_has_profile_pic + 
##     host_identity_verified + number_of_verifications
## 
## Complete :
##                         (Intercept) host_is_superhostTRUE
## number_of_verifications 20911           0                
##                         host_total_listings_count host_has_profile_picTRUE
## number_of_verifications     0                         0                   
##                         host_identity_verifiedTRUE
## number_of_verifications     0

It seems that number_of_verifications is our culprit, let’s run the regression without it.

# Refit Model F without number_of_verifications, then check VIF on the
# reduced model.
# NOTE(review): the response here is `price`, whereas mymodelg and mymodeli
# use `price_4_nights` -- confirm this is intended before comparing R-squared
# or residual SE across models.
modelf_adjusted <- lm(price ~ host_is_superhost + host_total_listings_count + host_has_profile_pic + host_identity_verified, data = listings_cleaned_f)

vif(modelf_adjusted) %>%
  kable() %>%
  kable_styling()
x
host_is_superhost 1.021
host_total_listings_count 1.002
host_has_profile_pic 1.001
host_identity_verified 1.021

This seems to have solved the problem, now let’s see if our regression tells us something novel.

# Print coefficients and fit statistics for the reduced Model F.
summary(modelf_adjusted) 
## 
## Call:
## lm(formula = price ~ host_is_superhost + host_total_listings_count + 
##     host_has_profile_pic + host_identity_verified, data = listings_cleaned_f)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
##  -2884   -300   -195    -17  76492 
## 
## Coefficients:
##                             Estimate Std. Error t value            Pr(>|t|)    
## (Intercept)                 700.1891   203.9621    3.43              0.0006 ***
## host_is_superhostTRUE        95.6749    43.4600    2.20              0.0277 *  
## host_total_listings_count     1.8947     0.0556   34.11 <0.0000000000000002 ***
## host_has_profile_picTRUE   -274.1047   204.4635   -1.34              0.1801    
## host_identity_verifiedTRUE  -48.6547    36.6238   -1.33              0.1840    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1910 on 20905 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.0531, Adjusted R-squared:  0.0529 
## F-statistic:  293 on 4 and 20905 DF,  p-value: <0.0000000000000002

We still find that there indeed is a price premium if a host is a super-host. Furthermore, we also still see that the total listings count has an impact on the price (premium). Adjusted R-squared is still 0.0529, so removing the variable did not improve our model.

5.6.2.6 Model G

Fewer than 2 terms so no use in VIF

5.6.2.7 Model I

# Variance inflation factors for mymodeli, rendered as a styled table.
vif(mymodeli) %>%
  kable() %>%
  kable_styling()
GVIF Df GVIF^(1/(2*Df))
cancellation_policy 1.046 5 1.004
bedrooms 2.286 1 1.512
bathrooms 2.011 1 1.418
accommodates 1.378 1 1.174

No problems here either.

5.6.3 Summary Tables (huxtable)

# Side-by-side regression table for all fitted models.
# huxreg() already returns a formatted huxtable, so it is printed directly.
# Piping it through kable() flattens it to a plain data frame, which loses the
# number formatting and adds a stray "names" column and a duplicated header
# row to the rendered table.
# NOTE(review): "Model F" (modelf_adjusted) is fitted on `price`, while
# mymodelg and mymodeli use `price_4_nights` -- its R squared and residual SE
# are therefore not on the same scale as those columns; confirm before ranking.
huxreg(
  list(
    "Model 1" = model1,
    "Model 2" = model2,
    "Model A" = mymodela,
    "Model B" = mymodelb,
    "Model C" = mymodelc,
    "Model D" = mymodeld,
    "Model E" = mymodele,
    "Model F" = modelf_adjusted,
    "Model G" = mymodelg,
    "Model I" = mymodeli
  ),
  statistics = c(
    "#observations" = "nobs",
    "R squared" = "r.squared",
    "Adj. R Squared" = "adj.r.squared",
    "Residual SE" = "sigma"
  ),
  bold_signif = 0.05
)
names Model 1 Model 2 Model A Model B Model C Model D Model E Model F Model G Model I
Model 1 Model 2 Model A Model B Model C Model D Model E Model F Model G Model I
1 (Intercept) 2665.80373722167 *** 2863.31649679719 *** 1373.56304760703 *** 1543.52597283389 *** 1338.84199596149 *** 570.979500460525 *** 409.195756322163 *** 700.189143881403 *** 2255.7496910215 *** 417.974176232004 ***
2 (493.831198874327) (495.276529084339) (81.8730132989876) (74.3094507813971) (74.6520376499834) (93.877103151643) (99.5605282824683) (203.962120603776) (67.2501312358082) (105.591931170869)
3 number_of_reviews -0.74365691852765 -3.00811449064875
4 (2.23829568286529) (2.2615722733781)
5 prop_type_simplifiedBoutique hotel 452.911525981365 381.814989939969
6 (327.261470219716) (366.797886639531)
7 prop_type_simplifiedHouse -21.0310807257259 48.5474398256521
8 (322.411611960802) (321.977894288409)
9 prop_type_simplifiedOther 246.776206573285 276.954622437153
10 (208.226775099041) (216.603865928541)
11 prop_type_simplifiedServiced apartment 439.449031129747 173.57496327442
12 (280.230585672306) (285.190623768192)
13 review_scores_rating -10.7719725200272 * -9.33729991559724
14 (5.26067880215465) (5.25629789585014)
15 room_typeHotel room 782.088517178091 *
16 (374.010970820755)
17 room_typePrivate room -879.566661877273 ***
18 (165.55993892689)
19 room_typeShared room -1460.05389137127 *
20 (638.071183545976)
21 bathrooms 545.579120408411 *** 338.559250180018 *** 334.946276051673 ***
22 (51.0682019480521) (71.0075140053) (72.2415497478511)
23 bedrooms 360.15800509051 *** -120.357557938856 * -135.765654410497 *
24 (36.9384530066663) (55.7965854005025) (55.5629207170478)
25 beds 315.063166987962 *** 8.27821994414329
26 (25.8195537938598) (35.4107986206482)
27 accommodates 460.635680255991 *** 412.016941287965 *** 434.304439344051 ***
28 (24.3100997387693) (31.0448438353153) (28.7518162714076)
29 host_is_superhostTRUE 95.6748925909414 *
30 (43.4600415702343)
31 host_total_listings_count 1.89471350281978 ***
32 (0.0555547777010792)
33 host_has_profile_picTRUE -274.104682752846
34 (204.463456240145)
35 host_identity_verifiedTRUE -48.6547439728043
36 (36.6238421121075)
37 neighbourhood_simplifiedEast -1169.74232776797 ***
38 (162.279614900289)
39 neighbourhood_simplifiedFar East -667.214179657748 *
40 (304.77261085548)
41 neighbourhood_simplifiedFar West -141.533697000572
42 (225.867042623846)
43 neighbourhood_simplifiedNorth -130.188852987599
44 (233.773328969641)
45 neighbourhood_simplifiedWest -718.028867222633 **
46 (275.134302616739)
47 cancellation_policymoderate -306.981032481855 *
48 (145.905660066239)
49 cancellation_policystrict -342.290538535808
50 (7840.43795726366)
51 cancellation_policystrict_14_with_grace_period 425.158931840075 **
52 (147.715379644877)
53 cancellation_policysuper_strict_30 1072.203229539
54 (2175.98854205583)
55 cancellation_policysuper_strict_60 818.236323438462
56 (7840.35438340087)
1.1 #observations 9639 9639 20831 20755 20261 20911 20114 20910 20764 20686
2.1 R squared 0.00110229510886997 0.00566526329785218 0.0054497010682834 0.00455998011420604 0.00729624591033085 0.0168816238634554 0.0176374108553016 0.0530675565870543 0.00278923205362719 0.0186233842383247
3.1 Adj. R Squared 0.000480058166454245 0.00473588198823349 0.00540195272227861 0.00451201403605406 0.00724724528077902 0.0168346049540797 0.0174420033086021 0.0528863688437561 0.00254903290921371 0.0182436863650309
4.1 Residual SE 7186.16431911236 7170.84913021399 7864.41963946265 7881.82196146728 7596.76614539442 7804.48253368504 7584.38521536803 1913.23437804512 7887.21053226126 7840.0267645568
.1 *** p < 0.001; ** p < 0.01; * p < 0.05.

5.6.4 Our best model

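Looking at our huxtable, we see that Model F seems to have the best explanatory power (highest adjusted R-squared, 0.0529) for staying at an AirBnB for four nights in Istanbul. Note, however, that Model F was fitted on price rather than price_4_nights, so its R-squared and residual standard error are not directly comparable with those of the other models. It’s also important to note that all of our models have a very poor prediction power…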

# Coefficient table (estimate, std.error, statistic, p.value) for the
# reduced Model F, rendered as a styled table.
tidy(modelf_adjusted) %>%
  kable() %>%
  kable_styling()
term estimate std.error statistic p.value
(Intercept) 700.189 203.9621 3.433 0.0006
host_is_superhostTRUE 95.675 43.4600 2.201 0.0277
host_total_listings_count 1.895 0.0556 34.105 0.0000
host_has_profile_picTRUE -274.105 204.4635 -1.341 0.1801
host_identity_verifiedTRUE -48.655 36.6238 -1.329 0.1840
# One-row summary of model-level fit statistics for the reduced Model F,
# rendered as a styled table.
glance(modelf_adjusted) %>%
  kable() %>%
  kable_styling()
r.squared adj.r.squared sigma statistic p.value df logLik AIC BIC deviance df.residual nobs
0.0531 0.0529 1913 292.9 0 4 -187675 375362 375410 76522037242 20905 20910

That is:

Model: 700.189+95.675(host_is_superhostTRUE)+1.895(host_total_listings_count)-274.105(host_has_profile_picTRUE)-48.655(host_identity_verifiedTRUE)

Following the filter criteria set forth in the problem, we predict the cost:

# Point prediction from modelf_adjusted for every listing that matches the
# trip criteria (apartment, private room, >= 10 reviews, rating > 90).
# The prediction comes from predict() on the fitted model instead of
# hand-copied coefficients, so it cannot drift from the fit (the typed values
# 1.84 and -478.568 did not match the model's 1.895 and -274.105), and
# host_total_listings_count is multiplied by its coefficient rather than
# passed through count().
# Result: one row per listing with columns listing_url and predicted_cost,
# sorted by listing_url. predicted_cost is NA where a predictor is missing.
# NOTE(review): assumes the four host_* columns in listings_cleaned have the
# same types as in the data modelf_adjusted was fitted on -- confirm.
# NOTE(review): modelf_adjusted models `price`, so predicted_cost is on that
# scale -- confirm whether it should be converted to a four-night total.
final_prediction <- listings_cleaned %>%
  filter(
    prop_type_simplified == "Apartment",
    room_type == "Private room",
    number_of_reviews >= 10,
    review_scores_rating > 90
  ) %>%
  mutate(predicted_cost = unname(predict(modelf_adjusted, newdata = .))) %>%
  select(listing_url, predicted_cost) %>%
  arrange(listing_url)

We see that the total cost of staying at an AirBnB in Istanbul for four days fitting the criteria set forth above varies from:

  • Lowest price: 377.4 (found using slice())
  • Highest price: 523.7

From our regression model we know that there is a standard error for every explanatory variable. Let’s find the 95% confidence interval for the above prices:

# 95% confidence bounds for the predicted price of every listing that matches
# the trip criteria (apartment, private room, >= 10 reviews, rating > 90).
# The bounds come from predict(..., interval = "confidence"), which uses the
# model's full coefficient covariance. Shifting individual coefficients by one
# standard error (as done previously, and only for some of them) does not
# produce a 95% interval.
# Use interval = "prediction" instead if the interval should cover the price
# of an individual listing rather than the expected price.
# NOTE(review): assumes the four host_* columns in listings_cleaned have the
# same types as in the data modelf_adjusted was fitted on -- confirm.
trip_candidates <- listings_cleaned %>%
  filter(
    prop_type_simplified == "Apartment",
    room_type == "Private room",
    number_of_reviews >= 10,
    review_scores_rating > 90
  )

# Matrix with columns "fit", "lwr" and "upr", one row per candidate listing.
trip_interval <- predict(
  modelf_adjusted,
  newdata = trip_candidates,
  interval = "confidence",
  level = 0.95
)

# Lower bound per listing (column name kept from the original object).
LOWER_final_prediction <- tibble(
  listing_url = trip_candidates$listing_url,
  LOWER_predicted_cost = unname(trip_interval[, "lwr"])
) %>%
  arrange(listing_url)

# Upper bound per listing (column name kept from the original object).
HIGHER_final_prediction <- tibble(
  listing_url = trip_candidates$listing_url,
  predicted_cost = unname(trip_interval[, "upr"])
) %>%
  arrange(listing_url)

To conclude, we therefore see that:

5.6.4.1 Predicted Price (Point)

  • Lowest price: 377.4 (found using slice())
  • Highest price: 523.7

5.6.4.2 Intervals

For our minimum and maximum price we see that the confidence interval set gives us:

  • minimum price with 95% confidence = 136.8 to 822.5
  • maximum price with 95% confidence = 276.2 to 975.6

The wide price range tells us that our model is poor at predicting the price. However, this model F had the highest R-squared value out of the possible models.